In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
HouseDF=pd.read_csv("kc_house_data.csv")
In [4]:
HouseDF.head()
Out[4]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

In [5]:
HouseDF.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB
In [5]:
HouseDF.describe()
Out[5]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656873 1788.390691 291.509045 1971.005136 84.402258 98077.939805 47.560053 -122.213896 1986.552492 12768.455652
std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175459 828.090978 442.575043 29.373411 401.679240 53.505026 0.138564 0.140828 685.391304 27304.179631
min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000000 290.000000 0.000000 1900.000000 0.000000 98001.000000 47.155900 -122.519000 399.000000 651.000000
25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000000 1190.000000 0.000000 1951.000000 0.000000 98033.000000 47.471000 -122.328000 1490.000000 5100.000000
50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000000 1560.000000 0.000000 1975.000000 0.000000 98065.000000 47.571800 -122.230000 1840.000000 7620.000000
75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000000 2210.000000 560.000000 1997.000000 0.000000 98118.000000 47.678000 -122.125000 2360.000000 10083.000000
max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 871200.000000
In [6]:
HouseDF.columns()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_16108/2907648296.py in <module>
----> 1 HouseDF.columns()

TypeError: 'Index' object is not callable
In [7]:
HouseDF.columns
Out[7]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
In [9]:
sns.pairplot(HouseDF)
Out[9]:
<seaborn.axisgrid.PairGrid at 0x24458cb9ca0>
In [10]:
sns.heatmap(HouseDF.corr(),annot=True)
Out[10]:
<AxesSubplot:>
In [8]:
X=HouseDF[['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']]
Y=HouseDF['price']
In [9]:
from sklearn.model_selection import train_test_split
In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40, random_state=101)
In [15]:
X_train
Out[15]:
bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
2435 3 2.25 2550 9674 1.0 0 0 3 7 1850 700 1959 0 98178 47.4856 -122.247 2240 9674
256 4 1.75 2360 7620 1.0 0 0 4 7 1180 1180 1955 0 98106 47.5278 -122.345 1910 7620
13911 3 1.75 1770 2800 1.5 0 0 3 7 1770 0 1914 0 98103 47.6631 -122.357 1630 3254
1135 7 3.00 2940 8624 1.0 0 0 3 8 1690 1250 1977 0 98155 47.7555 -122.307 1850 8031
12181 4 2.50 2210 7079 2.0 0 0 3 8 2210 0 1993 0 98031 47.4206 -122.183 1970 7000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5695 3 2.25 1920 9672 2.0 0 0 4 8 1920 0 1984 0 98074 47.6233 -122.046 1950 10125
8006 3 1.00 1240 3600 1.5 0 0 3 7 1240 0 1902 0 98144 47.5986 -122.298 1680 3600
17745 3 2.25 1780 7332 2.0 0 0 3 7 1780 0 1987 0 98038 47.3593 -122.051 1510 7625
17931 2 1.00 1150 5000 1.0 0 0 4 7 1050 100 1924 0 98115 47.6846 -122.317 1463 4320
13151 3 1.00 1450 7930 1.0 0 0 4 6 1150 300 1923 0 98126 47.5212 -122.371 1040 7740

12967 rows × 18 columns

In [16]:
from sklearn.linear_model import LinearRegression
In [17]:
lm=LinearRegression()
In [18]:
lm.fit(X_train,Y_train)
Out[18]:
LinearRegression()
In [20]:
coeff_df=pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
In [21]:
coeff_df
Out[21]:
Coefficient
bedrooms -36819.464411
bathrooms 35835.150371
sqft_living 112.835604
sqft_lot 0.162778
floors 9277.688561
waterfront 562834.274157
view 52300.123926
condition 27570.424376
grade 97072.844829
sqft_above 73.458807
sqft_basement 39.376796
yr_built -2636.599217
yr_renovated 18.202834
zipcode -595.883695
lat 608465.676716
long -221497.479203
sqft_living15 18.071370
sqft_lot15 -0.379984
In [22]:
prediction=lm.predict(X_test)lz
In [24]:
plt.scatter(Y_test,prediction)
Out[24]:
<matplotlib.collections.PathCollection at 0x2447ecb1a00>
In [29]:
sns.histplot((Y_test-prediction),bins=50);
In [30]:
lm.score(X_test,Y_test)
Out[30]:
0.7058600293775569
In [31]:
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')
In [32]:
clf.fit(X_train, Y_train)
Out[32]:
GradientBoostingRegressor(max_depth=5, n_estimators=400)
In [33]:
clf.score(X_test,Y_test)
Out[33]:
0.8904225008916121
In [34]:
prediction = lm.predict(X_test)
In [35]:
g=plt.plot((Y_test-prediction),marker='o',linestyle='')
In [36]:
lm=LinearRegression(fit_intercept=False)
lm.fit(X_train,Y_train)
prediction=lm.predict(X_test)
g=plt.plot((Y_test-prediction),marker='o',linestyle='')
In [37]:
lm.score(X_test,Y_test)
Out[37]:
0.7057973136589795
In [38]:
prediction = clf.predict(X_test)
In [39]:
g=plt.plot((Y_test-prediction),marker='o',linestyle='')
In [40]:
clf = ensemble.GradientBoostingRegressor(n_estimators = 401, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')
In [41]:
clf.fit(X_train, Y_train)
Out[41]:
GradientBoostingRegressor(max_depth=5, n_estimators=401)
In [42]:
clf.score(X_test,Y_test)
Out[42]:
0.886942189822708
In [43]:
prediction = clf.predict(X_test)
In [44]:
plt.scatter(Y_test,prediction)
Out[44]:
<matplotlib.collections.PathCollection at 0x2440194fac0>
In [45]:
sns.distplot((Y_test-prediction),bins=50);
C:\Users\shyam\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [46]:
sns.histplot((Y_test-prediction),bins=50);
In [47]:
sns.displot((Y_test-prediction),bins=50);
In [11]:
from sklearn.linear_model import Lasso
ls=Lasso()
ls.fit(X_train,Y_train)
C:\Users\shyam\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 243429488812860.1, tolerance: 175091918878.2423
  model = cd_fast.enet_coordinate_descent(
Out[11]:
Lasso()
In [12]:
ls.score(X_test,Y_test)
Out[12]:
0.7058597556855026
In [13]:
prediction=ls.predict(X_test)
In [15]:
from sklearn import metrics
In [16]:
score=metrics.r2_score(Y_test,prediction)
print("R Squared Error:",score)
R Squared Error: 0.7058597556855026
In [17]:
g=plt.plot((Y_test-prediction),marker='o',linestyle='')
In [18]:
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators = 390, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)
Out[18]:
0.8907139918590289
In [19]:
prediction = clf.predict(X_test)
In [20]:
score=metrics.r2_score(Y_test,prediction)
print("R Squared Error:",score)
R Squared Error: 0.8907139918590289
In [21]:
g=plt.plot((Y_test-prediction),marker='o',linestyle='')
In [ ]: